package com.cmge.ad.spider.pic.leiren;

import java.util.List;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import com.cmge.ad.model.Picture;
import com.cmge.ad.spider.pipeline.MysqlPicturePipeline;

/**
 * @desc 	Image crawler for the "leiren tu" (shocking pictures) category of
 * 			Gaoxiaoba (hugao8.com). This category has already been crawled.
 * 			Do not enable multi-threading: the site runs a "SafeDog" protection
 * 			layer that intercepts requests.
 * 			http://www.hugao8.com/category/pic/leirentupian/page/2/
 * 
 * @author 	ljt
 * @time 	2014-12-30 7:51:34 PM
 */
public class GaoXiaoBaLeiRenTuImagCrawl implements PageProcessor {

	/** Regex identifying list pages; every other URL is handled as a picture detail page. */
	public static final String URL_LIST = "/leirentupian/page/\\w+";

	/** Prefix of list-page URLs; the page number is appended to it. */
	private static final String LIST_PAGE_PREFIX = "http://www.hugao8.com/category/pic/leirentupian/page/";

	/** Highest list page number that will be followed. */
	private static final int MAX_PAGE = 59;

	private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

	/**
	 * Dispatches a downloaded page: list pages queue their detail links and the
	 * next list page, all other pages are parsed as a single picture.
	 *
	 * @param page the page handed over by the spider
	 */
	@Override
	public void process(Page page) {
		if (page.getUrl().regex(URL_LIST).match()) {
			processListPage(page);
		} else {
			processDetailPage(page);
		}
	}

	/** Queues every detail link found on a list page, then the following list page. */
	private void processListPage(Page page) {
		// Collect all entries shown on the current list page.
		List<Selectable> items = page.getHtml().xpath("//div[@class='bd']//ul[@class='clearfix pList']//li").nodes();
		if (items != null) {
			for (Selectable item : items) {
				// get() yields null when the item has no link, so it must not be dereferenced.
				String detailUrl = item.xpath("//div[@class='picBox']//a[@class='pic']/@href").get();
				if (!StringUtils.isEmpty(detailUrl)) {
					page.addTargetRequest(detailUrl);
				}
			}
		}

		// Current page number, used to decide whether a further list page exists.
		int current = parsePageNumber(page.getUrl().get());
		System.out.println("current is " + current);
		if (current < MAX_PAGE) {
			page.addTargetRequest(LIST_PAGE_PREFIX + (current + 1));
		}
	}

	/** Extracts the picture of a detail page and publishes it under the "picture" field. */
	private void processDetailPage(Page page) {
		String imageUrl = page.getHtml().xpath("//div[@id='infoMain']//img/@src").get();
		if (StringUtils.isEmpty(imageUrl)) {
			// Nothing to store without an image.
			return;
		}

		String title = page.getHtml().xpath("//div[@class='postmeta']/h1/text()").get();

		Picture pic = new Picture();
		// A missing title should not discard the picture; store an empty description instead.
		pic.setDesc(title == null ? "" : title);
		pic.setMinImageUrl(imageUrl);
		pic.setMaxImageUrl(imageUrl);
		pic.setSource("gaoxiaoba_leirentu");
		page.putField("picture", pic);
	}

	/**
	 * Reads the page number from the last path segment of a list URL. Trailing
	 * slashes are ignored so that both ".../page/2" and ".../page/2/" yield 2.
	 *
	 * @param url the list page URL, may be null
	 * @return the parsed page number, or 1 when the segment is missing or not a number
	 */
	private static int parsePageNumber(String url) {
		if (url == null) {
			return 1;
		}
		int end = url.length();
		while (end > 0 && url.charAt(end - 1) == '/') {
			end--;
		}
		int start = url.lastIndexOf('/', end - 1) + 1;
		try {
			return Integer.parseInt(url.substring(start, end));
		} catch (NumberFormatException e) {
			System.err.println("cannot parse page number from " + url + ", assuming page 1");
			return 1;
		}
	}

	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Starts the crawl at the first list page and stores results through
	 * {@link MysqlPicturePipeline}.
	 */
	public static void main(String[] args) throws Exception {
		Spider qsSpider = Spider.create(new GaoXiaoBaLeiRenTuImagCrawl())
				.addUrl(LIST_PAGE_PREFIX + 1)
				.addPipeline(new MysqlPicturePipeline())
				// Single thread on purpose: the site's protection layer blocks parallel requests.
				.thread(1);
		qsSpider.start();
	}

}
